{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import string\n",
    "\n",
    "import nltk\n",
    "import pandas as pd\n",
    "\n",
    "from occupationcoder.coder import coder\n",
    "from occupationcoder.utilities import utilities as utils\n",
    "\n",
    "# Store the SOC_code values of utils.mg_buckets as strings.\n",
    "utils.mg_buckets.SOC_code = utils.mg_buckets.SOC_code.astype(str)\n",
    "myCoder = coder.Coder()\n",
    "\n",
    "# Download the NLTK 'wordnet' and 'punkt' resources.\n",
    "nltk.download('wordnet')\n",
    "nltk.download('punkt')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
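    "# CHANGE THIS: set `location` to the folder that holds the data to classify\n",
    "# (the .csv produced by the pre-processing done with the pre_class.py code).\n",
    "# The input file name is appended directly (location + 'quebec.csv'), so include the trailing path separator.\n",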
    "location = '' # analysis data"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Load the pre-processed postings.\n",
    "indeed = pd.read_csv(location + 'quebec.csv')\n",
    "# sort_values returns a new frame; assign it back, otherwise the sort is discarded.\n",
    "indeed = indeed.sort_values('date_first_visible')\n",
    "# Largest date_first_visible value in the file, as a quick check of the extract.\n",
    "print(indeed['date_first_visible'].max())"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Defining functions\n",
    "# function to get rid of certain characters (written by Boyan)\n",
    "def to_printable(x):\n",
    "    if isinstance(x, str):\n",
    "        return \"\".join(c if c in set(string.printable) else \"\" for c in x)\n",
    "    else:\n",
    "        return \"\""
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# function to erase special characters (from Turrell's algorithm)\n",
    "select_punct = set('!\"#$%&\\()*+,-./:;<=>?@[\\\\]^_`{|}~0123456789') #only removed \"'\"\n",
    "def replace_punctuation(s):\n",
    "    \"\"\"\n",
    "    Takes string as input.\n",
    "    Removes punctuation from a string if the character is in select_punct.\n",
    "    Returns a string.\n",
    "\n",
    "   >>> replace_punctuation('sales executives/ - london')\n",
    "   'sales executives   london'\n",
    "    \"\"\"\n",
    "    for i in set(select_punct):\n",
    "        if i in s:\n",
    "            s = s.replace(i, '')\n",
    "    return s"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Clean strings so that the algo runs\n",
    "# job_title is the main variable for classification, so rows without it are dropped first.\n",
    "indeed = indeed.dropna(subset=['job_title'])\n",
    "\n",
    "# Lowercase, then Boyan's function (unwanted characters), then Turrell's function (punctuation marks).\n",
    "clean_titles = (\n",
    "    indeed['job_title']\n",
    "    .str.lower()\n",
    "    .apply(to_printable)\n",
    "    .apply(replace_punctuation)\n",
    ")\n",
    "\n",
    "indeed = (\n",
    "    indeed\n",
    "    .assign(job_title=clean_titles)\n",
    "    # company name becomes job sector (to match it to Turrell's code)\n",
    "    .rename(columns={u'company_name': u'job_sector'})\n",
    "    # make an empty job_description field\n",
    "    .assign(job_description=\"\")\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Check the columns have the right names for the algorithm (job_title, job_sector, job_description).\n",
    "# Fail here rather than later in the coding step if one of them is absent\n",
    "# (e.g. the rename is a silent no-op when the file has no company_name column).\n",
    "required_columns = {'job_title', 'job_sector', 'job_description'}\n",
    "missing_columns = required_columns.difference(indeed.columns)\n",
    "assert not missing_columns, 'columns missing for the coder: %s' % sorted(missing_columns)\n",
    "indeed.columns"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "%%time\n",
    "# Build a coder with npartitions=20 and run codedataframe on the cleaned frame.\n",
    "# NOTE(review): this rebinds the myCoder instance created in the first cell.\n",
    "myCoder = coder.Coder(npartitions=20)\n",
    "df = myCoder.codedataframe(indeed)\n",
    "# Display the result returned by codedataframe.\n",
    "df"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# `location` is used as a plain prefix, exactly as when the input file was read.\n",
    "# Inserting '/' here would send the file to the filesystem root when location is ''.\n",
    "df.to_csv(location + 'quebec_classified.csv', encoding='utf-8')"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 2
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython2",
   "version": "2.7.18"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
